{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "e77e6470",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-27T14:16:29.004890Z",
     "start_time": "2024-06-27T14:16:28.985889Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from sklearn.model_selection import cross_val_predict\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from lightgbm import LGBMClassifier\n",
    "\n",
    "def get_fea_ppg(df):\n",
    "    \"\"\"Average the PPG signal over every distinct ``recording_time``.\n",
    "\n",
    "    Returns a one-column DataFrame ('PPG') indexed by ``recording_time``.\n",
    "    \"\"\"\n",
    "    ppg_by_time = df.groupby('recording_time')['PPG']\n",
    "    mean_ppg = ppg_by_time.mean()\n",
    "    return pd.DataFrame(mean_ppg)\n",
    "\n",
    "def get_fea_acc(df):\n",
    "    \"\"\"Average the three accelerometer axes per ``recording_time``.\n",
    "\n",
    "    Returns a DataFrame indexed by ``recording_time`` with the columns\n",
    "    Motion_dataX, Motion_dataY and Motion_dataZ.\n",
    "    \"\"\"\n",
    "    axes = ['Motion_dataX', 'Motion_dataY', 'Motion_dataZ']\n",
    "    mean_acc = df.groupby('recording_time')[axes].mean()\n",
    "    return pd.DataFrame(mean_acc)\n",
    "\n",
    "def get_fea_gsr(df):\n",
    "    \"\"\"Average the GSR signal over every distinct ``recording_time``.\n",
    "\n",
    "    Returns a one-column DataFrame ('GSR') indexed by ``recording_time``.\n",
    "    \"\"\"\n",
    "    gsr_by_time = df.groupby('recording_time')['GSR']\n",
    "    mean_gsr = gsr_by_time.mean()\n",
    "    return pd.DataFrame(mean_gsr)\n",
    "\n",
    "def manual_feature(df):\n",
    "    \"\"\"Build the hand-crafted feature vector for one window of sensor data.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Window with the columns Motion_dataX, Motion_dataY, Motion_dataZ,\n",
    "        GSR and PPG.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        36 values. For each of the five signals, in the order listed above:\n",
    "        mean, max, min, range (max - min), then mean/max/min of the first\n",
    "        difference. GSR additionally contributes its fraction of missing\n",
    "        samples right after its seven statistics.\n",
    "    \"\"\"\n",
    "    # Everything is read from the ``df`` argument; no notebook-global window\n",
    "    # variable is consulted, so the result depends only on what is passed in.\n",
    "    feat = []\n",
    "    for col in ('Motion_dataX', 'Motion_dataY', 'Motion_dataZ', 'GSR', 'PPG'):\n",
    "        signal = df[col]\n",
    "        delta = signal.diff(1)\n",
    "        feat.extend([\n",
    "            signal.mean(),\n",
    "            signal.max(),\n",
    "            signal.min(),\n",
    "            signal.max() - signal.min(),\n",
    "            delta.mean(),\n",
    "            delta.max(),\n",
    "            delta.min(),\n",
    "        ])\n",
    "        if col == 'GSR':\n",
    "            # Share of missing GSR samples in the window.\n",
    "            feat.append(signal.isnull().mean())\n",
    "    return feat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "0af0d82c",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-27T14:19:12.626293Z",
     "start_time": "2024-06-27T14:17:25.090204Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data0033 09:41:01 16:59:59\n",
      "data0129 06:00:00 16:59:59\n",
      "data0157 10:23:05 16:59:59\n",
      "data0172 10:43:28 16:59:59\n",
      "data0179 06:00:00 16:59:59\n",
      "data0191 08:23:27 16:59:59\n",
      "data0302 06:00:00 16:59:59\n",
      "data0369 09:10:05 16:59:59\n",
      "data0377 08:38:51 16:59:59\n",
      "data0418 06:00:00 16:59:59\n",
      "data0461 06:00:00 16:59:59\n",
      "data0541 06:00:00 16:59:59\n",
      "data0548 06:00:00 16:59:59\n",
      "data0562 09:11:31 16:59:59\n",
      "data0645 07:19:29 16:59:59\n",
      "data0667 06:00:00 16:59:59\n",
      "data0671 10:01:30 16:59:59\n",
      "data0699 06:00:00 16:59:59\n",
      "data0702 06:00:00 16:59:59\n",
      "data0856 06:00:00 16:59:59\n",
      "data0882 09:41:24 16:59:59\n",
      "data0943 06:00:00 16:24:55\n",
      "data0975 06:00:00 16:59:59\n",
      "data0982 11:10:27 16:59:59\n",
      "data0984 06:00:00 16:59:59\n",
      "data0999 06:00:00 16:59:59\n",
      "data1428 06:00:00 16:59:59\n",
      "data1576 06:00:00 16:59:59\n",
      "data2381 06:00:00 16:59:59\n",
      "data3862 06:00:00 16:59:59\n",
      "data4722 08:58:50 16:18:37\n",
      "data4998 09:26:17 16:59:59\n",
      "data5166 08:11:03 16:59:59\n",
      "data5468 06:00:00 16:59:59\n",
      "data7329 07:34:22 16:59:59\n",
      "data7431 06:00:00 16:59:59\n",
      "data9057 06:00:00 16:59:59\n",
      "data9339 06:00:00 16:59:59\n",
      "data9597 06:00:00 16:59:59\n"
     ]
    }
   ],
   "source": [
    "TRAIN_DIR = './training_data'\n",
    "TEST_DIR = './test_data'\n",
    "WINDOW_SIZE = 3000        # rows of the merged per-time table that form one sample\n",
    "GSR_PLACEHOLDER = 0.0061  # rounded GSR value that is turned into NaN below\n",
    "                          # (presumably a sensor placeholder reading -- TODO confirm)\n",
    "\n",
    "\n",
    "def _subject_ids(data_dir):\n",
    "    \"\"\"Return the subject folder names found in ``data_dir``, sorted.\n",
    "\n",
    "    ``os.listdir`` yields entries in arbitrary order, so the names are sorted\n",
    "    to make the row order of the feature tables reproducible. Entries whose\n",
    "    name contains '.csv' (e.g. the label file) and anything that is not a\n",
    "    directory are skipped.\n",
    "    \"\"\"\n",
    "    return sorted(\n",
    "        name for name in os.listdir(data_dir)\n",
    "        if '.csv' not in name and os.path.isdir(os.path.join(data_dir, name))\n",
    "    )\n",
    "\n",
    "\n",
    "def _load_subject(data_dir, sid):\n",
    "    \"\"\"Read one subject's ACC/GSR/PPG files and join them per recording_time.\"\"\"\n",
    "    # The three kinds of sensor observations, each averaged per recording_time.\n",
    "    df_acc = get_fea_acc(pd.read_csv(f'{data_dir}/{sid}/ACC.csv'))\n",
    "    df_gsr = get_fea_gsr(pd.read_csv(f'{data_dir}/{sid}/GSR.csv'))\n",
    "    df_ppg = get_fea_ppg(pd.read_csv(f'{data_dir}/{sid}/PPG.csv'))\n",
    "\n",
    "    # Join the three tables column-wise on their recording_time index.\n",
    "    merged = pd.concat([df_acc, df_gsr, df_ppg], axis=1).reset_index()\n",
    "\n",
    "    # Round first so the placeholder comparison is an exact match.\n",
    "    merged['GSR'] = merged['GSR'].round(4)\n",
    "    merged['GSR'] = merged['GSR'].replace(GSR_PLACEHOLDER, np.nan)\n",
    "    return merged\n",
    "\n",
    "\n",
    "train_label = pd.read_csv('training_data/train_label.csv', encoding='gb2312')\n",
    "# Index the labels by the '文件名' (file name) column once, outside the loop.\n",
    "label_by_sid = train_label.set_index('文件名')\n",
    "\n",
    "# For every subject in the training set\n",
    "train_features = []\n",
    "for sid in _subject_ids(TRAIN_DIR):\n",
    "    df = _load_subject(TRAIN_DIR, sid)\n",
    "    label = label_by_sid.loc[sid].values[0]\n",
    "\n",
    "    # Split into smaller, non-overlapping windows; a trailing partial window is dropped.\n",
    "    for idx in range(df.shape[0] // WINDOW_SIZE):\n",
    "        df_batch = df.iloc[idx*WINDOW_SIZE: (idx+1)*WINDOW_SIZE]\n",
    "        feat = manual_feature(df_batch)\n",
    "        feat = [sid] + feat + [label]\n",
    "        train_features.append(feat)\n",
    "\n",
    "# Same pipeline for the test subjects, without a label column.\n",
    "test_features = []\n",
    "for sid in _subject_ids(TEST_DIR):\n",
    "    df = _load_subject(TEST_DIR, sid)\n",
    "\n",
    "    print(sid, df['recording_time'].min(), df['recording_time'].max())\n",
    "    for idx in range(df.shape[0] // WINDOW_SIZE):\n",
    "        df_batch = df.iloc[idx*WINDOW_SIZE: (idx+1)*WINDOW_SIZE]\n",
    "        feat = manual_feature(df_batch)\n",
    "        feat = [sid] + feat\n",
    "        test_features.append(feat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "51c53aa7",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-27T14:20:20.151703Z",
     "start_time": "2024-06-27T14:20:19.986520Z"
    }
   },
   "outputs": [],
   "source": [
    "# Wrap the window-level feature rows in frames under new names. The inputs\n",
    "# are neither rebound nor mutated, so this cell can be re-run on its own\n",
    "# without the predicted label leaking into the test feature matrix.\n",
    "train_df = pd.DataFrame(train_features)\n",
    "test_df = pd.DataFrame(test_features)\n",
    "\n",
    "# Column 0 holds the subject id; the last training column holds the label.\n",
    "X_train = train_df.iloc[:, 1:-1]\n",
    "y_train = train_df.iloc[:, -1]\n",
    "X_test = test_df.iloc[:, 1:]\n",
    "\n",
    "model = LGBMClassifier()\n",
    "model.fit(X_train, y_train)\n",
    "test_pred = model.predict(X_test)\n",
    "\n",
    "# One predicted label per window, keyed by subject id.\n",
    "window_pred = pd.DataFrame({0: test_df[0], 'label': test_pred})\n",
    "\n",
    "# Majority vote over each subject's windows (assumes 0/1 labels -- TODO confirm);\n",
    "# a mean of exactly 0.5 maps to 0.\n",
    "pred = window_pred.groupby(0)['label'].mean() > 0.5\n",
    "\n",
    "pred = pred.astype(int).reset_index()\n",
    "\n",
    "pred.to_csv('lgb1.csv', index=False, header=False)\n",
    "# Score recorded by the original author: 0.92308"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
